# -*- coding: utf-8 -*-
from lxml import etree
import asyncio
import aiohttp
import pymysql
import time


class douyin_crawl:
    def __init__(self, url, conn):
        self.conn = conn
        self.cur = self.conn.cursor()
        self.url = url
        self.header = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
        }

    def parse_Iconfont(self, dy):
        '''将图标字体转化为字体'''
        dict_Iconfont = {r' \ue603 ': 0, r' \ue60d ': 0, r' \ue616 ': 0,
                         r' \ue602 ': 1, r' \ue60e ': 1, r' \ue618 ': 1,
                         r' \ue605 ': 2, r' \ue610 ': 2, r' \ue617 ': 2,
                         r' \ue604 ': 3, r' \ue611 ': 3, r' \ue61a ': 3,
                         r' \ue606 ': 4, r' \ue60c ': 4, r' \ue619 ': 4,
                         r' \ue607 ': 5, r' \ue60f ': 5, r' \ue61b ': 5,
                         r' \ue608 ': 6, r' \ue612 ': 6, r' \ue61f ': 6,
                         r' \ue60a ': 7, r' \ue613 ': 7, r' \ue61c ': 7,
                         r' \ue60b ': 8, r' \ue614 ': 8, r' \ue61d ': 8,
                         r' \ue609 ': 9, r' \ue615 ': 9, r' \ue61e ': 9, }
        return dict_Iconfont[dy]

    async def crawl_douyin(self):
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.get(url = self.url, headers = self.header)as r:
                        if r.status == 200:
                            req = await r.text()
                            data = etree.HTML(req)  # 用xpath来采集数据
                            userName = data.xpath('//*[@id="pagelet-user-info"]/div[2]/div[1]/p[1]/text()')[0]
                            douyin_ID_part1 = data.xpath('//*[@id="pagelet-user-info"]/div[2]/div[1]/p[2]/text()')[0][
                                              10:]
                            name_part = ""
                            try:
                                douyin_ID_part2 = data.xpath('//*[@id="pagelet-user-info"]/div[2]/div[1]/p[2]/i/text()')
                                for i in douyin_ID_part2:
                                    name_part += str(self.parse_Iconfont(repr(i)[1:-1]))
                            except:
                                pass
                            douyin_ID = int(douyin_ID_part1 + name_part)
                            # print(douyin_ID)
                            sign = data.xpath('//*[@id="pagelet-user-info"]/div[2]/div[2]/p[1]/text()')[0]
                            # print(sign)
                            # 关注数
                            focus_part1_str = ""
                            focus_part1 = data.xpath(
                                '//*[@id="pagelet-user-info"]/div[2]/div[2]/p[2]/span[1]/span[1]/text()')
                            for dot in focus_part1:
                                if dot == '.':
                                    focus_part1_str += dot
                            focus_part2_str = ""
                            focus_part2 = data.xpath(
                                '//*[@id="pagelet-user-info"]/div[2]/div[2]/p[2]/span[1]/span[1]/i/text()')
                            for j in focus_part2:
                                focus_part2_str += str(self.parse_Iconfont(repr(j)[1:-1]))
                            focus = focus_part2_str[:-1] + focus_part1_str + focus_part2_str[-1]
                            # print(focus)
                            # 粉丝数
                            fans_part1_str = ""
                            fans_part1 = data.xpath(
                                '//*[@id="pagelet-user-info"]/div[2]/div[2]/p[2]/span[2]/span[1]/text()')
                            for dot in fans_part1:
                                if dot == '.':
                                    fans_part1_str += dot
                            fans_part2_str = ""
                            fans_part2 = data.xpath(
                                '//*[@id="pagelet-user-info"]/div[2]/div[2]/p[2]/span[2]/span[1]/i/text()')
                            for k in fans_part2:
                                fans_part2_str += str(self.parse_Iconfont(repr(k)[1:-1]))
                            fans = fans_part2_str[:-1] + fans_part1_str + fans_part2_str[-1]
                            # print(fans)
                            # 点赞数
                            like_part1_str = ""
                            like_part1 = data.xpath(
                                '//*[@id="pagelet-user-info"]/div[2]/div[2]/p[2]/span[3]/span[1]/text()')
                            for dot in like_part1:
                                if dot == '.':
                                    like_part1_str += dot
                            like_part2_str = ""
                            like_part2 = data.xpath(
                                '//*[@id="pagelet-user-info"]/div[2]/div[2]/p[2]/span[3]/span[1]/i/text()')
                            for h in like_part2:
                                like_part2_str += str(self.parse_Iconfont(repr(h)[1:-1]))
                            like = like_part2_str[:-1] + like_part1_str + like_part2_str[-1]
                            # print(like)
                            # 作品
                            works_part1_str = ""
                            works_part1 = data.xpath('//*[@id="pagelet-user-info"]/div[3]/div/div[1]/span/text()')
                            for dot in works_part1:
                                if dot == '.':
                                    works_part1_str += dot
                            works_part2_str = ""
                            works_part2 = data.xpath('//*[@id="pagelet-user-info"]/div[3]/div/div[1]/span/i/text()')
                            for l in works_part2:
                                works_part2_str += str(self.parse_Iconfont(repr(l)[1:-1]))
                            works = works_part2_str[:-1] + works_part1_str + works_part2_str[-1]
                            # print(works)
                            # 喜欢数
                            authorlike_part1_str = ""
                            authorlike_part1 = data.xpath('//*[@id="pagelet-user-info"]/div[3]/div/div[2]/span/text()')
                            for dot in authorlike_part1:
                                if dot == '.':
                                    authorlike_part1_str += dot
                            authorlike_part2_str = ""
                            authorlike_part2 = data.xpath(
                                '//*[@id="pagelet-user-info"]/div[3]/div/div[2]/span/i/text()')
                            for l in authorlike_part2:
                                authorlike_part2_str += str(self.parse_Iconfont(repr(l)[1:-1]))
                            authorlike = authorlike_part2_str[:-1] + authorlike_part1_str + authorlike_part2_str[-1]
                            # print(authorlike)
                            t = time.ctime()
                            # data_dict = {'爬取日期': t, '用户名': userName, '抖音ID': douyin_ID, '简介': sign, '关注数': focus,
                            #              '粉丝数': fans, '点赞数': like, '作品数': works, '喜欢数': authorlike}
                            # print(data_dict)
                            try:
                                await self.insert_mysql(t, userName, douyin_ID, sign, focus, fans, like, works,
                                                        authorlike)
                            except:
                                pass
                except:
                    pass
        except:
            pass

    async def insert_mysql(self, t, userName, douyin_ID, sign, focus, fans, like, works, authorlike):
        data_tupe = (t, userName, douyin_ID, sign, focus, fans, like, works, authorlike)
        self.cur.execute(
            'INSERT INTO `抖音用户数据` ( `爬取日期`, `用户名`, `抖音ID`, `简介`, `关注数`, `粉丝数`, `点赞数`, `作品数`, `喜欢数`) VALUES {0}'.format(
                data_tupe))
        self.conn.commit()
        return
        pass


if __name__ == '__main__':
    # Keyword arguments are required: PyMySQL 1.0+ removed positional
    # connect() parameters.  utf8mb4 handles the Chinese identifiers/values.
    conn = pymysql.connect(host="127.0.0.1", user="root", password="pwd",
                           database="抖音用户数据", charset="utf8mb4")
    cur = conn.cursor()
    # `抖音ID` is BIGINT: the crawler builds IDs by digit concatenation and
    # they can exceed the signed 32-bit INT range.  (IF NOT EXISTS means an
    # already-created table keeps its old schema.)
    cur.execute('''
            CREATE TABLE IF NOT EXISTS `抖音用户数据`(
            `id` INT UNSIGNED AUTO_INCREMENT,
            `爬取日期` VARCHAR(255) NOT NULL,
            `用户名` VARCHAR(255) NOT NULL,
            `抖音ID` BIGINT NOT NULL,
            `简介` TEXT NOT NULL,
            `关注数` VARCHAR(255) NOT NULL,
            `粉丝数` VARCHAR(255) NOT NULL,
            `点赞数` VARCHAR(255) NOT NULL,
            `作品数` VARCHAR(255) NOT NULL,
            `喜欢数` VARCHAR(255) NOT NULL,
            PRIMARY KEY ( `id` ,`抖音ID`)
)ENGINE=InnoDB DEFAULT CHARSET=utf8;
            ''')
    url = "https://www.iesdouyin.com/share/user/{0}?timestamp=1560602021"
    loop = asyncio.get_event_loop()
    start, end = 1, 200
    try:
        # Crawl user IDs in batches of 200 concurrent requests, forever.
        while True:
            tasks = [douyin_crawl(url.format(uid), conn).crawl_douyin()
                     for uid in range(start, end)]
            loop.run_until_complete(asyncio.gather(*tasks))
            start, end = end, end + 200
    except KeyboardInterrupt:
        # Only a manual interrupt stops the crawl; crawl_douyin() already
        # swallows per-profile errors, so the old bare `except: break`
        # mostly hid real bugs.
        pass
    finally:
        loop.close()
        conn.close()  # was leaked before